{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Policy Iteration in python "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def policy_evaluation(policy, environment, discount_factor=1.0, theta=1e-9, max_iterations=1e9):\n",
    "        # Number of evaluation iterations\n",
    "        evaluation_iterations = 1\n",
    "        # Initialize a value function for each state as zero\n",
    "        V = np.zeros(environment.nS)\n",
    "        # Repeat until change in value is below the threshold\n",
    "        for i in range(int(max_iterations)):\n",
    "                # Initialize a change of value function as zero\n",
    "                delta = 0\n",
    "                # Iterate though each state\n",
    "                for state in range(environment.nS):\n",
    "                       # Initial a new value of current state\n",
    "                       v = 0\n",
    "                       # Try all possible actions which can be taken from this state\n",
    "                       for action, action_probability in enumerate(policy[state]):\n",
    "                             # Check how good next state will be\n",
    "                             for state_probability, next_state, reward, terminated in environment.P[state][action]:\n",
    "                                  # Calculate the expected value\n",
    "                                  v += action_probability * state_probability * (reward + discount_factor * V[next_state])\n",
    "                       \n",
    "                       # Calculate the absolute change of value function\n",
    "                       delta = max(delta, np.abs(V[state] - v))\n",
    "                       # Update value function\n",
    "                       V[state] = v\n",
    "                evaluation_iterations += 1\n",
    "                \n",
    "                # Terminate if value change is insignificant\n",
    "                if delta < theta:\n",
    "                        print(f'Policy evaluated in {evaluation_iterations} iterations.')\n",
    "                        return V"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def one_step_lookahead(environment, state, V, discount_factor):\n",
    "        action_values = np.zeros(environment.nA)\n",
    "        for action in range(environment.nA):\n",
    "                for probability, next_state, reward, terminated in environment.P[state][action]:\n",
    "                        action_values[action] += probability * (reward + discount_factor * V[next_state])\n",
    "        return action_values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def policy_iteration(environment, discount_factor=1.0, max_iterations=1e9):\n",
    "        # Start with a random policy\n",
    "        #num states x num actions / num actions\n",
    "        policy = np.ones([environment.nS, environment.nA]) / environment.nA\n",
    "        # Initialize counter of evaluated policies\n",
    "        evaluated_policies = 1\n",
    "        # Repeat until convergence or critical number of iterations reached\n",
    "        for i in range(int(max_iterations)):\n",
    "                stable_policy = True\n",
    "                # Evaluate current policy\n",
    "                V = policy_evaluation(policy, environment, discount_factor=discount_factor)\n",
    "                # Go through each state and try to improve actions that were taken (policy Improvement)\n",
    "                for state in range(environment.nS):\n",
    "                        # Choose the best action in a current state under current policy\n",
    "                        current_action = np.argmax(policy[state])\n",
    "                        # Look one step ahead and evaluate if current action is optimal\n",
    "                        # We will try every possible action in a current state\n",
    "                        action_value = one_step_lookahead(environment, state, V, discount_factor)\n",
    "                        # Select a better action\n",
    "                        best_action = np.argmax(action_value)\n",
    "                        # If action didn't change\n",
    "                        if current_action != best_action:\n",
    "                                stable_policy = True\n",
    "                                # Greedy policy update\n",
    "                                policy[state] = np.eye(environment.nA)[best_action]\n",
    "                evaluated_policies += 1\n",
    "                # If the algorithm converged and policy is not changing anymore, then return final policy and value function\n",
    "                if stable_policy:\n",
    "                        print(f'Evaluated {evaluated_policies} policies.')\n",
    "                        return policy, V"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Value Iteration in python "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def value_iteration(environment, discount_factor=1.0, theta=1e-9, max_iterations=1e9):\n",
    "        # Initialize state-value function with zeros for each environment state\n",
    "        V = np.zeros(environment.nS)\n",
    "        for i in range(int(max_iterationsations)):\n",
    "                # Early stopping condition\n",
    "                delta = 0\n",
    "                # Update each state\n",
    "                for state in range(environment.nS):\n",
    "                        # Do a one-step lookahead to calculate state-action values\n",
    "                        action_value = one_step_lookahead(environment, state, V, discount_factor)\n",
    "                        # Select best action to perform based on the highest state-action value\n",
    "                        best_action_value = np.max(action_value)\n",
    "                        # Calculate change in value\n",
    "                        delta = max(delta, np.abs(V[state] - best_action_value))\n",
    "                        # Update the value function for current state\n",
    "                        V[state] = best_action_value\n",
    "                        # Check if we can stop\n",
    "                if delta < theta:\n",
    "                        print(f'Value-iteration converged at iteration#{i}.')\n",
    "                        break\n",
    "\n",
    "        # Create a deterministic policy using the optimal value function\n",
    "        policy = np.zeros([environment.nS, environment.nA])\n",
    "        for state in range(environment.nS):\n",
    "                # One step lookahead to find the best action for this state\n",
    "                action_value = one_step_lookahead(environment, state, V, discount_factor)\n",
    "                # Select best action based on the highest state-action value\n",
    "                best_action = np.argmax(action_value)\n",
    "                # Update the policy to perform a better action at a current state\n",
    "                policy[state, best_action] = 1.0\n",
    "        return policy, V"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def play_episodes(environment, n_episodes, policy):\n",
    "        wins = 0\n",
    "        total_reward = 0\n",
    "        for episode in range(n_episodes):\n",
    "                terminated = False\n",
    "                state = environment.reset()\n",
    "                while not terminated:\n",
    "                        # Select best action to perform in a current state\n",
    "                        action = np.argmax(policy[state])\n",
    "                        # Perform an action an observe how environment acted in response\n",
    "                        next_state, reward, terminated, info = environment.step(action)\n",
    "                        # Summarize total reward\n",
    "                        total_reward += reward\n",
    "                        # Update current state\n",
    "                        state = next_state\n",
    "                        # Calculate number of wins over episodes\n",
    "                        if terminated and reward == 1.0:\n",
    "                                wins += 1\n",
    "        average_reward = total_reward / n_episodes\n",
    "        return wins, total_reward, average_reward\n",
    "\n",
    "# Number of episodes to play\n",
    "n_episodes = 10000\n",
    "# Functions to find best policy\n",
    "solvers = [('Policy Iteration', policy_iteration),\n",
    "           ('Value Iteration', value_iteration)]\n",
    "for iteration_name, iteration_func in solvers:\n",
    "        # Load a Frozen Lake environment\n",
    "        environment = gym.make('FrozenLake-v0')\n",
    "        # Search for an optimal policy using policy iteration\n",
    "        policy, V = iteration_func(environment.env)\n",
    "        # Apply best policy to the real environment\n",
    "        wins, total_reward, average_reward = play_episodes(environment, n_episodes, policy)\n",
    "        print(f'{iteration_name} :: number of wins over {n_episodes} episodes = {wins}')\n",
    "        print(f'{iteration_name} :: average reward over {n_episodes} episodes = {average_reward} \\n\\n')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
